"""
安装依赖
pip install lxml

"""
# 导入模块
import requests
from bs4 import BeautifulSoup
import os
import re

# Request headers: spoof a desktop Chrome UA so the site serves normal
# pages instead of rejecting the default python-requests user agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Base URL of the mobile site; every fetched path is relative to this.
BASEURL = "https://m.20xs.org"


# Fetch a page's HTML source
def get_html(url):
    """Fetch a page from the site and return its HTML text.

    :param url: path relative to BASEURL, e.g. "/245800_1/"
    :return: decoded response body
    :raises requests.HTTPError: on a non-2xx status code
    :raises requests.Timeout: if the server does not answer in time
    """
    # timeout: never hang forever on a dead connection;
    # raise_for_status: fail fast instead of parsing an error page as a chapter.
    res = requests.get(BASEURL + url, headers=headers, timeout=30)
    res.raise_for_status()
    return res.text


# Parse the chapter-index pagination
def parse_html_get_all_page(html):
    """Extract the relative URL of every chapter-index page.

    The pager lives in <div class="pagelist">; each <option>'s ``value``
    attribute holds the relative URL of one page of the chapter index.

    :param html: HTML of the first chapter-index page
    :return: list of relative page URLs, in document order
    """
    soup = BeautifulSoup(html, 'lxml')
    tag_div = soup.find('div', class_='pagelist')
    # Removed the leftover debug print; one URL per <option>.
    return [tag.get('value') for tag in tag_div.find_all("option")]


def parse_html_get_all_catalogue(page_list):
    """Collect every chapter entry listed on the given index pages.

    :param page_list: relative URLs of chapter-index pages
    :return: list of dicts ``{"name": chapter title, "url": relative URL}``,
             in reading order
    """
    result = []
    for page_url in page_list:
        soup = BeautifulSoup(get_html(page_url), 'lxml')
        # The chapter list is the <ul class="read">; each <a> is one chapter.
        tag_ul = soup.find('ul', class_='read')
        result.extend(
            {"name": tag.get_text(), "url": tag.get('href')}
            for tag in tag_ul.find_all("a")
        )
    return result


def parse_html_get_all_catalogue_content(page_list):
    """Download the full text of every chapter.

    Follows "下一页" (next page) links until a chapter ends, so chapters
    split across three or more pages are fetched completely (the old code
    stopped after one hop).  Each page is fetched exactly once — the
    original fetched every page twice (once for the text, once for the
    pager).

    :param page_list: chapter entries as returned by
        :func:`parse_html_get_all_catalogue`
    :return: list of text fragments: each chapter's title followed by the
        body text of each of its pages
    """

    def _parse_page(url):
        """Fetch one page; return (body text, next-page URL or None)."""
        soup = BeautifulSoup(get_html(url), 'lxml')
        tag_div = soup.find('div', class_='content')
        content = "".join(tag.get_text() + "\n" for tag in tag_div.find_all("p"))

        next_url = None
        pager = soup.find('div', class_='pager')
        if pager is not None:  # last page of a book may have no pager at all
            for tag in pager.find_all("a"):
                # NOTE(review): assumes the site labels intra-chapter links
                # "下一页" and the next-chapter link differently ("下一章"),
                # as the original code did — confirm against live pages.
                if tag.get_text() == "下一页":
                    next_url = tag.get('href')
                    break
        return content, next_url

    result = []
    for page_obj in page_list:
        result.append(page_obj['name'] + "\n")
        url = page_obj['url']
        seen = set()
        # Walk the chapter's pages; `seen` guards against a pager that
        # loops back on itself.
        while url and url not in seen:
            seen.add(url)
            content, url = _parse_page(url)
            result.append(content)

    return result


# Entry point
def main():
    """Scrape the configured chapter-index page, download every chapter
    and append the text to ./法师禁咒.txt."""
    url = '/245800_1/'
    # Normally all index pages would be discovered from the pager first:
    #   all_page_url = parse_html_get_all_page(get_html(url))
    # Hard-coded to a single index page for now.
    all_page_url = ["/245800_18/"]

    # Chapter entries ({"name", "url"}) listed on those index pages.
    all_chapter_url = parse_html_get_all_catalogue(all_page_url)

    # Full text of every chapter, title + body fragments.
    all_chapter_content = parse_html_get_all_catalogue_content(all_chapter_url)

    # Open the output file once and write everything, instead of
    # re-opening it for every single fragment as before.
    with open('./法师禁咒.txt', 'a+', encoding='utf-8') as fp:
        fp.writelines(all_chapter_content)

    print("完成")



# Guard the entry point so importing this module does not start scraping.
if __name__ == "__main__":
    main()
